import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the transactions CSV (absolute, machine-specific path) and preview
# the first five rows.
Data = pd.read_csv(
    filepath_or_buffer="/Users/mac/Downloads/online fraud payment.csv",
)
Data.head(n=5)
| step | type | amount | nameOrig | oldbalanceOrg | newbalanceOrig | nameDest | oldbalanceDest | newbalanceDest | isFraud | isFlaggedFraud | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | PAYMENT | 9839.64 | C1231006815 | 170136.0 | 160296.36 | M1979787155 | 0.0 | 0.0 | 0 | 0 |
| 1 | 1 | PAYMENT | 1864.28 | C1666544295 | 21249.0 | 19384.72 | M2044282225 | 0.0 | 0.0 | 0 | 0 |
| 2 | 1 | TRANSFER | 181.00 | C1305486145 | 181.0 | 0.00 | C553264065 | 0.0 | 0.0 | 1 | 0 |
| 3 | 1 | CASH_OUT | 181.00 | C840083671 | 181.0 | 0.00 | C38997010 | 21182.0 | 0.0 | 1 | 0 |
| 4 | 1 | PAYMENT | 11668.14 | C2048537720 | 41554.0 | 29885.86 | M1230701703 | 0.0 | 0.0 | 0 | 0 |
# Dimensions of the loaded frame as (rows, columns).
Data.shape
(6362620, 11)
# Show the last rows of the dataset.
Data.tail()
| step | type | amount | nameOrig | oldbalanceOrg | newbalanceOrig | nameDest | oldbalanceDest | newbalanceDest | isFraud | isFlaggedFraud | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 6362615 | 743 | CASH_OUT | 339682.13 | C786484425 | 339682.13 | 0.0 | C776919290 | 0.00 | 339682.13 | 1 | 0 |
| 6362616 | 743 | TRANSFER | 6311409.28 | C1529008245 | 6311409.28 | 0.0 | C1881841831 | 0.00 | 0.00 | 1 | 0 |
| 6362617 | 743 | CASH_OUT | 6311409.28 | C1162922333 | 6311409.28 | 0.0 | C1365125890 | 68488.84 | 6379898.11 | 1 | 0 |
| 6362618 | 743 | TRANSFER | 850002.52 | C1685995037 | 850002.52 | 0.0 | C2080388513 | 0.00 | 0.00 | 1 | 0 |
| 6362619 | 743 | CASH_OUT | 850002.52 | C1280323807 | 850002.52 | 0.0 | C873221189 | 6510099.11 | 7360101.63 | 1 | 0 |
# Count the missing values in each column.
Data.isna().sum()
step 0 type 0 amount 0 nameOrig 0 oldbalanceOrg 0 newbalanceOrig 0 nameDest 0 oldbalanceDest 0 newbalanceDest 0 isFraud 0 isFlaggedFraud 0 dtype: int64
# Print a summary of the frame: index range, column dtypes, and memory usage.
Data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6362620 entries, 0 to 6362619 Data columns (total 11 columns): # Column Dtype --- ------ ----- 0 step int64 1 type object 2 amount float64 3 nameOrig object 4 oldbalanceOrg float64 5 newbalanceOrig float64 6 nameDest object 7 oldbalanceDest float64 8 newbalanceDest float64 9 isFraud int64 10 isFlaggedFraud int64 dtypes: float64(5), int64(3), object(3) memory usage: 534.0+ MB
So the given dataset contains no null or NaN values.
Before moving forward, we need to find the types of transactions in the given dataset.
# Number of transactions for each distinct value of the 'type' column.
Data.type.value_counts()
CASH_OUT 2237500 PAYMENT 2151495 CASH_IN 1399284 TRANSFER 532909 DEBIT 41432 Name: type, dtype: int64
# Count rows whose 'step' value already appeared earlier (True) versus rows
# that introduce a new 'step' value (False).
# NOTE(review): this looks only at the 'step' column, not at whole-row
# duplicates — confirm that is the intended check.
Data['step'].duplicated().value_counts()
True 6361877 False 743 Name: step, dtype: int64
# drop_duplicates() returns a new DataFrame and leaves the original untouched,
# so the result has to be assigned back for the de-duplication to take effect.
Data = Data.drop_duplicates()
# Preview the first rows of the de-duplicated frame.
Data.head()
| step | type | amount | nameOrig | oldbalanceOrg | newbalanceOrig | nameDest | oldbalanceDest | newbalanceDest | isFraud | isFlaggedFraud | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | PAYMENT | 9839.64 | C1231006815 | 170136.0 | 160296.36 | M1979787155 | 0.0 | 0.0 | 0 | 0 |
| 1 | 1 | PAYMENT | 1864.28 | C1666544295 | 21249.0 | 19384.72 | M2044282225 | 0.0 | 0.0 | 0 | 0 |
| 2 | 1 | TRANSFER | 181.00 | C1305486145 | 181.0 | 0.00 | C553264065 | 0.0 | 0.0 | 1 | 0 |
| 3 | 1 | CASH_OUT | 181.00 | C840083671 | 181.0 | 0.00 | C38997010 | 21182.0 | 0.0 | 1 | 0 |
| 4 | 1 | PAYMENT | 11668.14 | C2048537720 | 41554.0 | 29885.86 | M1230701703 | 0.0 | 0.0 | 0 | 0 |
# Re-check the frame's dimensions as (rows, columns).
Data.shape
(6362620, 11)
import plotly.express as px

# Count transactions per type, then split the result into its labels
# (index) and its counts (values).
TYPE = Data["type"].value_counts()
transactions, quantity = TYPE.index, TYPE.values

# Pie chart showing the share of each transaction type.
figure = px.pie(
    data_frame=Data,
    names=transactions,
    values=quantity,
    title='Distribution of Transaction Type',
)
figure.show()
# Pairwise correlation between the numeric columns only. The frame also holds
# text columns (type, nameOrig, nameDest); pandas >= 2.0 no longer drops them
# silently inside corr() and raises instead, so the numeric columns are
# selected explicitly (this form works on older pandas versions as well).
correlation = Data.select_dtypes(include="number").corr()
# Rank every numeric column by its correlation with the 'isFraud' flag.
print(correlation['isFraud'].sort_values(ascending=False))
isFraud 1.000000 amount 0.076688 isFlaggedFraud 0.044109 step 0.031578 oldbalanceOrg 0.010154 newbalanceDest 0.000535 oldbalanceDest -0.005885 newbalanceOrig -0.008148 Name: isFraud, dtype: float64
from seaborn import heatmap

# Heatmap of the correlation matrix. Only numeric columns are passed to
# corr(): the frame contains text columns, and pandas >= 2.0 raises on them
# instead of skipping them.
heatmap(Data.select_dtypes(include="number").corr())
<AxesSubplot:>
# Replace each transaction-type label in the 'type' column with an integer code:
#   "CASH_OUT" -> 1
#   "PAYMENT"  -> 2
#   "CASH_IN"  -> 3
#   "TRANSFER" -> 4
#   "DEBIT"    -> 5
Data["type"] = Data["type"].replace(
    to_replace={
        "CASH_OUT": 1,
        "PAYMENT": 2,
        "CASH_IN": 3,
        "TRANSFER": 4,
        "DEBIT": 5,
    }
)
# Replace the 0/1 values in 'isFraud' with readable labels:
#   0 -> "NO Fraud"
#   1 -> "Fraud"
Data["isFraud"] = Data["isFraud"].replace(
    to_replace={
        0: "NO Fraud",
        1: "Fraud",
    }
)
Data.head(10)
# Display the first 10 rows again to verify that the replaced values are correct:
# check the 'type' and 'isFraud' columns.
| step | type | amount | nameOrig | oldbalanceOrg | newbalanceOrig | nameDest | oldbalanceDest | newbalanceDest | isFraud | isFlaggedFraud | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2 | 9839.64 | C1231006815 | 170136.00 | 160296.36 | M1979787155 | 0.0 | 0.00 | NO Fraud | 0 |
| 1 | 1 | 2 | 1864.28 | C1666544295 | 21249.00 | 19384.72 | M2044282225 | 0.0 | 0.00 | NO Fraud | 0 |
| 2 | 1 | 4 | 181.00 | C1305486145 | 181.00 | 0.00 | C553264065 | 0.0 | 0.00 | Fraud | 0 |
| 3 | 1 | 1 | 181.00 | C840083671 | 181.00 | 0.00 | C38997010 | 21182.0 | 0.00 | Fraud | 0 |
| 4 | 1 | 2 | 11668.14 | C2048537720 | 41554.00 | 29885.86 | M1230701703 | 0.0 | 0.00 | NO Fraud | 0 |
| 5 | 1 | 2 | 7817.71 | C90045638 | 53860.00 | 46042.29 | M573487274 | 0.0 | 0.00 | NO Fraud | 0 |
| 6 | 1 | 2 | 7107.77 | C154988899 | 183195.00 | 176087.23 | M408069119 | 0.0 | 0.00 | NO Fraud | 0 |
| 7 | 1 | 2 | 7861.64 | C1912850431 | 176087.23 | 168225.59 | M633326333 | 0.0 | 0.00 | NO Fraud | 0 |
| 8 | 1 | 2 | 4024.36 | C1265012928 | 2671.00 | 0.00 | M1176932104 | 0.0 | 0.00 | NO Fraud | 0 |
| 9 | 1 | 5 | 5337.77 | C712410124 | 41720.00 | 36382.23 | C195600860 | 41898.0 | 40348.79 | NO Fraud | 0 |
from sklearn.model_selection import train_test_split

# Feature matrix built from five columns of the frame. The target keeps the
# 'isFraud' labels as a single-column 2-D array.
x = Data[["step", "type", "amount", "oldbalanceOrg", "newbalanceOrig"]].to_numpy()
y = Data[["isFraud"]].to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Four candidate classifiers; the first three use the library defaults.
model1 = DecisionTreeClassifier()
model2 = LogisticRegression()
model3 = KNeighborsClassifier()
# Random forest with 150 trees, using entropy as the split criterion.
model4 = RandomForestClassifier(criterion='entropy', n_estimators=150)
# Train the decision tree; ravel() flattens the label array to 1-D.
model1.fit(x_train, y_train.ravel())
DecisionTreeClassifier()
# Train the logistic regression; ravel() flattens the label array to 1-D.
model2.fit(x_train, y_train.ravel())
LogisticRegression()
# Train the k-nearest-neighbours classifier; ravel() flattens the label array to 1-D.
model3.fit(x_train, y_train.ravel())
KNeighborsClassifier()
# Train the random forest; ravel() flattens the label array to 1-D.
model4.fit(x_train, y_train.ravel())
RandomForestClassifier(criterion='entropy', n_estimators=150)
# Report each fitted model's score on the held-out test split, in the order
# the models were defined.
# NOTE(review): score() here is plain accuracy; if fraud cases are rare,
# precision/recall would be more informative — confirm the class balance.
for label, fitted in (
    ('DecisionTreeClassifier:', model1),
    ('LogisticRegression', model2),
    ('KNeighborsClassifier', model3),
    ('RandomForestClassifier', model4),
):
    print(label, fitted.score(x_test, y_test))
DecisionTreeClassifier: 0.9996000075440621 LogisticRegression 0.9981509189610569 KNeighborsClassifier 0.9996314411358843 RandomForestClassifier 0.9996793773634132
# Preview the first rows of the frame.
Data.head()
| step | type | amount | nameOrig | oldbalanceOrg | newbalanceOrig | nameDest | oldbalanceDest | newbalanceDest | isFraud | isFlaggedFraud | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2 | 9839.64 | C1231006815 | 170136.0 | 160296.36 | M1979787155 | 0.0 | 0.0 | NO Fraud | 0 |
| 1 | 1 | 2 | 1864.28 | C1666544295 | 21249.0 | 19384.72 | M2044282225 | 0.0 | 0.0 | NO Fraud | 0 |
| 2 | 1 | 4 | 181.00 | C1305486145 | 181.0 | 0.00 | C553264065 | 0.0 | 0.0 | Fraud | 0 |
| 3 | 1 | 1 | 181.00 | C840083671 | 181.0 | 0.00 | C38997010 | 21182.0 | 0.0 | Fraud | 0 |
| 4 | 1 | 2 | 11668.14 | C2048537720 | 41554.0 | 29885.86 | M1230701703 | 0.0 | 0.0 | NO Fraud | 0 |
# Preview the last rows of the frame.
Data.tail()
| step | type | amount | nameOrig | oldbalanceOrg | newbalanceOrig | nameDest | oldbalanceDest | newbalanceDest | isFraud | isFlaggedFraud | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 6362615 | 743 | 1 | 339682.13 | C786484425 | 339682.13 | 0.0 | C776919290 | 0.00 | 339682.13 | Fraud | 0 |
| 6362616 | 743 | 4 | 6311409.28 | C1529008245 | 6311409.28 | 0.0 | C1881841831 | 0.00 | 0.00 | Fraud | 0 |
| 6362617 | 743 | 1 | 6311409.28 | C1162922333 | 6311409.28 | 0.0 | C1365125890 | 68488.84 | 6379898.11 | Fraud | 0 |
| 6362618 | 743 | 4 | 850002.52 | C1685995037 | 850002.52 | 0.0 | C2080388513 | 0.00 | 0.00 | Fraud | 0 |
| 6362619 | 743 | 1 | 850002.52 | C1280323807 | 850002.52 | 0.0 | C873221189 | 6510099.11 | 7360101.63 | Fraud | 0 |
# One hand-built sample, with values in the same column order used to build
# the training matrix.
features = np.array([[
    1,       # step
    4,       # type (4 is the code assigned to "TRANSFER")
    181.00,  # amount
    181.00,  # oldbalanceOrg
    0.0,     # newbalanceOrig
]])
# Classify the sample with the fitted k-nearest-neighbours model.
print(model3.predict(features))
['Fraud']